Semi Parametric Analysis

Authors

Dr. Muhammad Za’im bin Mohd Samsuri

Dr. Tengku Muhammad Hudzaifah bin Tengku Mokhtar

Dr. Muhammad Abdul Hafiz bin Kamarul

Published

June 15, 2025

knitr::include_graphics("group.gif")

1 Load required packages

library(DT)
library(lmtest)
library(caret)
library (pROC)
library(generalhoslem)
library(ResourceSelection)
library(survival)
library(survminer)
library(haven)
library(broom)
library(tidyverse)
library(lubridate)
library(gtsummary)
library(ggplot2)
library(corrplot)
library(lubridate)

2 Load dataset

data2 <- read_dta("stroke_fatality.dta")
glimpse(data2)
Rows: 226
Columns: 49
$ sex          <dbl+lbl> 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1,…
$ race         <dbl+lbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ doa          <date> 2011-01-01, 2011-01-03, 2011-01-06, 2011-01-16, 2011-01-…
$ dod          <date> 2011-01-05, 2011-01-06, 2011-01-22, 2011-02-08, 2011-01-…
$ time         <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ time2        <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ status2      <dbl+lbl> 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1,…
$ gcs          <dbl> 15, 15, 15, 11, 15, 7, 5, 13, 15, 15, 10, 15, 14, 9, 15, …
$ sbp          <dbl> 150, 152, 231, 110, 199, 190, 145, 161, 222, 161, 149, 15…
$ dbp          <dbl> 87, 108, 117, 79, 134, 101, 102, 96, 129, 107, 90, 61, 95…
$ hr           <dbl> 92, 87, 64, 90, 72, 63, 102, 81, 72, 94, 59, 81, 61, 120,…
$ dm           <dbl+lbl>  1, NA,  1,  8, NA,  2,  1,  1,  2,  2,  2,  2, NA, N…
$ hpt          <dbl+lbl>  1,  1,  1,  1,  1,  2,  1,  1,  1,  2,  1,  2,  1,  …
$ ckd          <dbl+lbl> NA, NA,  2,  8, NA,  2,  8,  2,  2,  2,  2,  2, NA, N…
$ af           <dbl+lbl> NA, NA,  2,  8, NA,  2,  8,  2,  2,  2,  2,  2, NA, N…
$ hd           <dbl+lbl> NA, NA,  2,  8, NA,  2,  8,  1,  2,  2,  2,  2, NA,  …
$ dyslipid     <dbl+lbl> NA, NA,  1,  8, NA,  2,  8,  2,  2,  2,  2,  2, NA, N…
$ tia          <dbl+lbl> NA, NA,  2,  8, NA,  2,  8,  2,  2,  2,  2,  2, NA, N…
$ smoker       <dbl+lbl> NA, NA,  2,  8, NA,  2,  8,  2,  2,  1,  2,  2, NA, N…
$ hb           <dbl> 10.4, 13.0, 11.0, 14.3, 15.7, 11.7, 13.4, 11.8, 12.6, 16.…
$ plat         <dbl> 249, 156, 179, 233, 351, 133, 290, 251, 196, 188, 139, 30…
$ wbc          <dbl> 12.5, 7.4, 22.4, 9.6, 18.7, 11.3, 15.8, 8.5, 9.0, 9.5, 11…
$ na           <dbl> 138, 132, 135, 132, 138, 140, 134, 135, 129, 137, 141, 13…
$ potas        <dbl> 3.6, 4.1, 4.7, 3.8, 3.8, 3.0, 4.1, 4.0, 4.0, 3.7, 4.1, 4.…
$ gluc         <dbl> NA, 5.1, NA, NA, NA, NA, NA, NA, NA, 5.5, NA, NA, NA, NA,…
$ cbs          <dbl> 11.4, NA, 18.6, 6.8, 6.5, 8.4, 13.4, 14.9, 6.6, 5.8, 7.7,…
$ chol         <dbl> NA, 4.7, NA, NA, NA, NA, NA, NA, NA, 6.9, NA, NA, NA, NA,…
$ tg           <dbl> NA, 1.1, NA, NA, NA, NA, NA, NA, NA, 1.0, NA, NA, NA, NA,…
$ urea         <dbl> 5.1, 8.4, 11.0, 7.4, 3.8, 4.0, 3.3, 5.6, 5.8, 5.8, 7.3, 7…
$ referral     <dbl> 4, 4, 4, 3, 1, 2, 4, 3, 3, 4, 1, 4, 1, 3, 4, 3, 3, 1, 4, …
$ transport    <dbl> 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, …
$ age2         <dbl> 69, 64, 63, 67, 66, 78, 55, 65, 67, 56, 50, 78, 59, 83, 8…
$ sex2         <dbl+lbl> 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1,…
$ marital2     <dbl+lbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1,…
$ status1b     <dbl+lbl> 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1,…
$ status2b     <dbl+lbl> 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1,…
$ status3b     <dbl+lbl> 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
$ hpt2         <dbl+lbl>  1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  1,  0,  1,  …
$ marital3     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, …
$ race2        <dbl+lbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ sex3         <dbl+lbl> 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,…
$ referral2    <dbl> 3, 3, 3, 2, 0, 1, 3, 2, 2, 3, 0, 3, 0, 2, 3, 2, 2, 0, 3, …
$ referral2cat <dbl+lbl> 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,…
$ icd10cat     <dbl+lbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,…
$ icd10cat2    <dbl+lbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,…
$ icd10cat3    <dbl+lbl> 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1,…
$ dm2cat       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
$ hpt2cat      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
$ dyslipid2cat <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

3 Check the structure of the dataset

convert to factor

data2<-data2 %>% mutate_if(is.labelled,~ as_factor(.))
glimpse(data2)
Rows: 226
Columns: 49
$ sex          <fct> female, male, female, female, male, female, female, femal…
$ race         <fct> malay, malay, malay, malay, malay, chinese, malay, malay,…
$ doa          <date> 2011-01-01, 2011-01-03, 2011-01-06, 2011-01-16, 2011-01-…
$ dod          <date> 2011-01-05, 2011-01-06, 2011-01-22, 2011-02-08, 2011-01-…
$ time         <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ time2        <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ status2      <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ gcs          <dbl> 15, 15, 15, 11, 15, 7, 5, 13, 15, 15, 10, 15, 14, 9, 15, …
$ sbp          <dbl> 150, 152, 231, 110, 199, 190, 145, 161, 222, 161, 149, 15…
$ dbp          <dbl> 87, 108, 117, 79, 134, 101, 102, 96, 129, 107, 90, 61, 95…
$ hr           <dbl> 92, 87, 64, 90, 72, 63, 102, 81, 72, 94, 59, 81, 61, 120,…
$ dm           <fct> yes, NA, yes, 8, NA, 2, yes, yes, 2, 2, 2, 2, NA, NA, yes…
$ hpt          <fct> yes, yes, yes, yes, yes, 2, yes, yes, yes, 2, yes, 2, yes…
$ ckd          <fct> NA, NA, 2, 8, NA, 2, 8, 2, 2, 2, 2, 2, NA, NA, 8, NA, NA,…
$ af           <fct> NA, NA, 2, 8, NA, 2, 8, 2, 2, 2, 2, 2, NA, NA, 8, NA, NA,…
$ hd           <fct> NA, NA, 2, 8, NA, 2, 8, yes, 2, 2, 2, 2, NA, yes, yes, NA…
$ dyslipid     <fct> NA, NA, yes, 8, NA, 2, 8, 2, 2, 2, 2, 2, NA, NA, 8, NA, N…
$ tia          <fct> NA, NA, 2, 8, NA, 2, 8, 2, 2, 2, 2, 2, NA, NA, 8, NA, NA,…
$ smoker       <fct> NA, NA, 2, 8, NA, 2, 8, 2, 2, yes, 2, 2, NA, NA, 8, NA, N…
$ hb           <dbl> 10.4, 13.0, 11.0, 14.3, 15.7, 11.7, 13.4, 11.8, 12.6, 16.…
$ plat         <dbl> 249, 156, 179, 233, 351, 133, 290, 251, 196, 188, 139, 30…
$ wbc          <dbl> 12.5, 7.4, 22.4, 9.6, 18.7, 11.3, 15.8, 8.5, 9.0, 9.5, 11…
$ na           <dbl> 138, 132, 135, 132, 138, 140, 134, 135, 129, 137, 141, 13…
$ potas        <dbl> 3.6, 4.1, 4.7, 3.8, 3.8, 3.0, 4.1, 4.0, 4.0, 3.7, 4.1, 4.…
$ gluc         <dbl> NA, 5.1, NA, NA, NA, NA, NA, NA, NA, 5.5, NA, NA, NA, NA,…
$ cbs          <dbl> 11.4, NA, 18.6, 6.8, 6.5, 8.4, 13.4, 14.9, 6.6, 5.8, 7.7,…
$ chol         <dbl> NA, 4.7, NA, NA, NA, NA, NA, NA, NA, 6.9, NA, NA, NA, NA,…
$ tg           <dbl> NA, 1.1, NA, NA, NA, NA, NA, NA, NA, 1.0, NA, NA, NA, NA,…
$ urea         <dbl> 5.1, 8.4, 11.0, 7.4, 3.8, 4.0, 3.3, 5.6, 5.8, 5.8, 7.3, 7…
$ referral     <dbl> 4, 4, 4, 3, 1, 2, 4, 3, 3, 4, 1, 4, 1, 3, 4, 3, 3, 1, 4, …
$ transport    <dbl> 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, …
$ age2         <dbl> 69, 64, 63, 67, 66, 78, 55, 65, 67, 56, 50, 78, 59, 83, 8…
$ sex2         <fct> female, male, female, female, male, female, female, femal…
$ marital2     <fct> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, yes, 4, 4, 4, 3, yes,…
$ status1b     <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ status2b     <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ status3b     <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ hpt2         <fct> yes, yes, yes, yes, yes, no, yes, yes, yes, no, yes, no, …
$ marital3     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, …
$ race2        <fct> yes, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes…
$ sex3         <fct> female, male, female, female, male, female, female, femal…
$ referral2    <dbl> 3, 3, 3, 2, 0, 1, 3, 2, 2, 3, 0, 3, 0, 2, 3, 2, 2, 0, 3, …
$ referral2cat <fct> GP/Home/Missing, GP/Home/Missing, GP/Home/Missing, hospit…
$ icd10cat     <fct> "CI,Others", "CI,Others", "Haemorrhagic", "Haemorrhagic",…
$ icd10cat2    <fct> "CI,Others", "CI,Others", "Haemorrhagic", "Haemorrhagic",…
$ icd10cat3    <fct> "CI,Others", "CI,Others", "ICB, Other Haemorrhage", "ICB,…
$ dm2cat       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
$ hpt2cat      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
$ dyslipid2cat <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

4 Select relevant variables

time:time to event

-status3b: status of the patient (dead or alive)
-gcs: Glasgow Coma Scale score
-age2:age in numerical
-sex3:male,female
-dm2cat: diabetes mellitus category(yes(1) or no(0))
-hpt2cat: hypertension category(yes(1) or no(0))
-dyslipid2cat: dyslipidemia category(yes(1) or no(0))
-icd10cat3: ICD-10 category of the stroke(CI and others,ICB,other hemorrhage)

data2 <- data2 %>% select(time, gcs, status3b, age2, sex3, dm2cat, icd10cat3, hpt2cat, dyslipid2cat)
data2
glimpse(data2)
Rows: 226
Columns: 9
$ time         <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ gcs          <dbl> 15, 15, 15, 11, 15, 7, 5, 13, 15, 15, 10, 15, 14, 9, 15, …
$ status3b     <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ age2         <dbl> 69, 64, 63, 67, 66, 78, 55, 65, 67, 56, 50, 78, 59, 83, 8…
$ sex3         <fct> female, male, female, female, male, female, female, femal…
$ dm2cat       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
$ icd10cat3    <fct> "CI,Others", "CI,Others", "ICB, Other Haemorrhage", "ICB,…
$ hpt2cat      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
$ dyslipid2cat <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

5 Handle missing values (gcs)

Missing values for the GCS variable were handled using mean imputation, where the average GCS value in the dataset was used to replace all missing entries. This simple method is appropriate when the proportion of missing data is small.

data2$gcs[is.na(data2$gcs)] <- mean(data2$gcs, na.rm = TRUE)
glimpse(data2)
Rows: 226
Columns: 9
$ time         <dbl> 4, 3, 16, 23, 5, 4, 22, 14, 4, 2, 22, 2, 43, 2, 4, 4, 3, …
$ gcs          <dbl> 15, 15, 15, 11, 15, 7, 5, 13, 15, 15, 10, 15, 14, 9, 15, …
$ status3b     <fct> dead, alive, alive, alive, alive, dead, alive, dead, aliv…
$ age2         <dbl> 69, 64, 63, 67, 66, 78, 55, 65, 67, 56, 50, 78, 59, 83, 8…
$ sex3         <fct> female, male, female, female, male, female, female, femal…
$ dm2cat       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
$ icd10cat3    <fct> "CI,Others", "CI,Others", "ICB, Other Haemorrhage", "ICB,…
$ hpt2cat      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, …
$ dyslipid2cat <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
str(data2)
tibble [226 × 9] (S3: tbl_df/tbl/data.frame)
 $ time        : num [1:226] 4 3 16 23 5 4 22 14 4 2 ...
  ..- attr(*, "label")= chr "time"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ gcs         : num [1:226] 15 15 15 11 15 7 5 13 15 15 ...
  ..- attr(*, "label")= chr "earliest Glasgow Coma Scale"
  ..- attr(*, "format.stata")= chr "%2.0f"
 $ status3b    : Factor w/ 2 levels "alive","dead": 2 1 1 1 1 2 1 2 1 1 ...
  ..- attr(*, "label")= chr "status @discharge 1=dead, 0=alive"
 $ age2        : num [1:226] 69 64 63 67 66 78 55 65 67 56 ...
  ..- attr(*, "label")= chr "age in years"
  ..- attr(*, "format.stata")= chr "%10.0g"
 $ sex3        : Factor w/ 2 levels "female","male": 1 2 1 1 2 1 1 1 2 2 ...
  ..- attr(*, "label")= chr "male=1, female=0"
 $ dm2cat      : num [1:226] 1 0 1 0 0 0 1 1 0 0 ...
  ..- attr(*, "label")= chr "dm yes=1, no=0"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ icd10cat3   : Factor w/ 3 levels "CI,Others","SAH",..: 1 1 3 3 3 3 3 1 1 1 ...
  ..- attr(*, "label")= chr "0=Cerebral Isch or others, 1=SAH, 2=Haemorrhagic"
 $ hpt2cat     : num [1:226] 1 1 1 1 1 0 1 1 1 0 ...
  ..- attr(*, "label")= chr "hpt yes=1, no=0"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ dyslipid2cat: num [1:226] 0 0 1 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "dyslipid yes=1, no=0"
  ..- attr(*, "format.stata")= chr "%9.0g"
 - attr(*, "label")= chr "Data file created by EpiData based on DEADALIVEHUSM2.REC"

6 Summary data

Explore data

library(gtsummary)
tbl_summary(data2)
Characteristic N = 2261
time 4 (2, 7)
earliest Glasgow Coma Scale 15.0 (10.0, 15.0)
status @discharge 1=dead, 0=alive
    alive 173 (77%)
    dead 53 (23%)
age in years 61 (52, 69)
male=1, female=0
    female 129 (57%)
    male 97 (43%)
dm yes=1, no=0 74 (33%)
0=Cerebral Isch or others, 1=SAH, 2=Haemorrhagic
    CI,Others 149 (66%)
    SAH 19 (8.4%)
    ICB, Other Haemorrhage 58 (26%)
hpt yes=1, no=0 148 (65%)
dyslipid yes=1, no=0 27 (12%)
1 Median (Q1, Q3); n (%)
tbl_summary(
  data2,
  by = status3b,
  label = list(
    sex3 ~ "Sex",
    dm2cat ~ "Diabetes Mellitus",
    hpt2cat ~ "Hypertension",
    dyslipid2cat ~ "Dyslipidemia",
    gcs ~ "Earliest Glasgow Coma Scale",
    age2 ~ "Age in Years",
    icd10cat3 ~ "Stroke Subtype"
  ),
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  )
) %>%
  modify_header(label = "**Variable**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Status**") %>%
  modify_caption("Summary of Data by Status")
Summary of Data by Status
Variable
Status
alive
N = 1731
dead
N = 531
time 6 (7) 8 (9)
Earliest Glasgow Coma Scale 13.7 (2.7) 8.5 (4.0)
Age in Years 60 (14) 62 (15)
Sex

    female 91 (53%) 38 (72%)
    male 82 (47%) 15 (28%)
Diabetes Mellitus 60 (35%) 14 (26%)
Stroke Subtype

    CI,Others 132 (76%) 17 (32%)
    SAH 8 (4.6%) 11 (21%)
    ICB, Other Haemorrhage 33 (19%) 25 (47%)
Hypertension 113 (65%) 35 (66%)
Dyslipidemia 26 (15%) 1 (1.9%)
1 Mean (SD); n (%)

Among the patients, those who died tended to have lower Glasgow Coma Scale (GCS) scores at admission. Diabetes mellitus appeared more frequently in the group who died compared to those who survived. Hypertension was common in both groups, with only slight differences in proportion. The distribution of sex was generally similar between the two groups. Dyslipidemia showed a fairly even distribution as well. For stroke subtype, hemorrhagic and subarachnoid strokes appeared more often among those who died, while ischemic strokes were more frequent in the group who survived.

7 Kaplan Meier survival analysis

7.1 Kaplan Meier estimates

Kaplan-Meier survival analysis is a non-parametric survival analysis

KM1 <- survfit(Surv(time = time, event = status3b == 'dead') ~1, data = data2)
summary(KM1)
Call: survfit(formula = Surv(time = time, event = status3b == "dead") ~ 
    1, data = data2)

 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0    226       3    0.987 0.00761       0.9719        1.000
    1    221      10    0.942 0.01559       0.9120        0.973
    2    197       4    0.923 0.01797       0.8884        0.959
    3    169       4    0.901 0.02060       0.8616        0.942
    4    133       4    0.874 0.02403       0.8282        0.922
    5     92       5    0.827 0.03071       0.7685        0.889
    6     67       3    0.789 0.03601       0.7220        0.863
    7     58       4    0.735 0.04259       0.6561        0.823
    9     44       2    0.702 0.04675       0.6157        0.800
   10     38       1    0.683 0.04903       0.5935        0.786
   12     34       4    0.603 0.05742       0.5001        0.727
   14     25       2    0.555 0.06213       0.4452        0.691
   18     20       1    0.527 0.06492       0.4138        0.671
   22     16       1    0.494 0.06870       0.3761        0.649
   25     10       2    0.395 0.08321       0.2615        0.597
   28      6       1    0.329 0.09178       0.1907        0.569
   29      5       1    0.263 0.09413       0.1308        0.531
   41      3       1    0.176 0.09528       0.0606        0.509

The Kaplan-Meier analysis showed that the estimated survival probability at admission was 98.7%. By Day 5, survival declined to about 82.7%, and by Day 10, it was 68.3%. A more noticeable drop occurred after Day 12, with survival estimated at 60.3%, and it continued to decrease over time. By Day 29, the survival probability was around 26.3%, and at the end of follow-up (Day 41), it was 17.6%. The number of patients at risk decreased over time, especially after Day 25, leading to wider confidence intervals in the later estimates.

7.1.1 Plot the Kaplan-Meier survival curve

ggsurvplot(KM1, data = data2, risk.table = TRUE, pval = TRUE)

7.2 Kaplan-meire based on groups(sex)

KM2 <- survfit(Surv(time = time, event = status3b== 'dead') ~ sex3, 
                     type = "kaplan-meier", data = data2)
summary(KM2)
Call: survfit(formula = Surv(time = time, event = status3b == "dead") ~ 
    sex3, data = data2, type = "kaplan-meier")

                sex3=female 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0    129       3   0.9767  0.0133       0.9511        1.000
    1    126       6   0.9302  0.0224       0.8873        0.975
    2    112       4   0.8970  0.0271       0.8455        0.952
    3     98       2   0.8787  0.0295       0.8228        0.938
    4     80       4   0.8348  0.0352       0.7685        0.907
    5     60       4   0.7791  0.0425       0.7001        0.867
    6     40       1   0.7596  0.0457       0.6752        0.855
    7     37       3   0.6980  0.0541       0.5997        0.812
    9     29       1   0.6740  0.0573       0.5705        0.796
   12     24       3   0.5897  0.0677       0.4709        0.739
   14     18       1   0.5570  0.0714       0.4332        0.716
   18     15       1   0.5198  0.0757       0.3907        0.692
   25      8       2   0.3899  0.0978       0.2385        0.637
   28      4       1   0.2924  0.1118       0.1382        0.619
   29      3       1   0.1949  0.1090       0.0651        0.583
   41      2       1   0.0975  0.0879       0.0167        0.571

                sex3=male 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    1     95       4    0.958  0.0206        0.918        0.999
    3     71       2    0.931  0.0275        0.879        0.986
    5     32       1    0.902  0.0391        0.828        0.982
    6     27       2    0.835  0.0581        0.729        0.957
    7     21       1    0.795  0.0676        0.673        0.939
    9     15       1    0.742  0.0813        0.599        0.920
   10     13       1    0.685  0.0929        0.525        0.894
   12     10       1    0.617  0.1059        0.440        0.863
   14      7       1    0.529  0.1220        0.336        0.831
   22      4       1    0.396  0.1465        0.192        0.818

7.2.1 plot survival curve for kaplan meier by group(sex)

ggsurvplot(KM2, data = data2, risk.table = TRUE, 
           linetype = c(1,2), pval = TRUE)

The Kaplan-Meier analysis showed that both male and female patients experienced a decline in survival over time. Initially, survival was high in both groups, with females at 97.7% and males at 95.8%. However, survival declined more noticeably among females, reaching 77.9% by Day 5 and dropping to 9.8% by Day 41. In comparison, males showed a slower decline, with survival at 90.2% by Day 5 and 39.6% by Day 22. Although survival was lower among females throughout most time points.

7.3 kaplan meier based on dm status

KM3 <- survfit(Surv(time = time, event = status3b== 'dead') ~ dm2cat, 
                     type = "kaplan-meier", data = data2)
summary(KM3)
Call: survfit(formula = Surv(time = time, event = status3b == "dead") ~ 
    dm2cat, data = data2, type = "kaplan-meier")

                dm2cat=0 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0    152       2    0.987 0.00924        0.969        1.000
    1    148       9    0.927 0.02124        0.886        0.969
    2    128       4    0.898 0.02503        0.850        0.948
    3    105       2    0.881 0.02732        0.829        0.936
    4     85       2    0.860 0.03035        0.803        0.922
    5     56       5    0.783 0.04287        0.704        0.872
    6     42       3    0.727 0.05054        0.635        0.833
    7     36       2    0.687 0.05522        0.587        0.804
    9     29       1    0.663 0.05817        0.558        0.788
   10     25       1    0.637 0.06160        0.527        0.770
   12     21       4    0.515 0.07391        0.389        0.683
   18     14       1    0.479 0.07726        0.349        0.657
   22     10       1    0.431 0.08304        0.295        0.629
   25      5       1    0.345 0.10174        0.193        0.615
   29      4       1    0.258 0.10672        0.115        0.581

                dm2cat=1 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0     74       1    0.986  0.0134       0.9605        1.000
    1     73       1    0.973  0.0189       0.9367        1.000
    3     64       2    0.943  0.0280       0.8893        0.999
    4     48       2    0.903  0.0382       0.8315        0.981
    7     22       2    0.821  0.0653       0.7026        0.960
    9     15       1    0.766  0.0807       0.6235        0.942
   14      9       2    0.596  0.1234       0.3973        0.894
   25      5       1    0.477  0.1453       0.2625        0.867
   28      2       1    0.238  0.1836       0.0527        1.000
   41      1       1    0.000     NaN           NA           NA

7.3.1 plot survival curve for kaplan meier by group(dm)

ggsurvplot(KM3, data = data2, risk.table = TRUE, 
           linetype = c(1,2), pval = TRUE)

The survival analysis by diabetic status showed that both groups started with high initial survival estimates (98.7% in non-diabetics vs. 98.6% in diabetics). However, over time, survival declined more rapidly among those with diabetes. By Day 10, survival was about 63.7% in non-diabetics compared to 76.6% in diabetics. However, by Day 25, the survival estimate for diabetics dropped to 47.7%, while non-diabetics had a lower estimate of 34.5%. By the end of follow-up (Day 41), survival among diabetics reached 0%, though this should be interpreted cautiously due to the very small number at risk and wide confidence intervals. Overall, both groups showed decreasing trends in survival, with a slightly more variable pattern in the diabetic group.

7.4 kaplan meier based on hypertension status

KM4 <- survfit(Surv(time = time, event = status3b== 'dead') ~ hpt2cat, 
                     type = "kaplan-meier", data = data2)
summary(KM4)
Call: survfit(formula = Surv(time = time, event = status3b == "dead") ~ 
    hpt2cat, data = data2, type = "kaplan-meier")

                hpt2cat=0 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0     78       1    0.987  0.0127       0.9625        1.000
    1     76       3    0.948  0.0252       0.9001        0.999
    2     65       2    0.919  0.0318       0.8588        0.983
    3     53       1    0.902  0.0356       0.8346        0.974
    4     42       1    0.880  0.0407       0.8039        0.964
    5     25       3    0.775  0.0675       0.6530        0.919
    6     20       1    0.736  0.0744       0.6036        0.897
    7     19       2    0.658  0.0844       0.5122        0.846
    9     14       1    0.611  0.0905       0.4574        0.817
   12     10       1    0.550  0.1000       0.3854        0.786
   18      8       1    0.481  0.1086       0.3094        0.749
   41      2       1    0.241  0.1787       0.0562        1.000

                hpt2cat=1 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0    148       2    0.986 0.00949       0.9681        1.000
    1    145       7    0.939 0.01975       0.9009        0.978
    2    132       2    0.925 0.02186       0.8828        0.968
    3    116       3    0.901 0.02528       0.8525        0.952
    4     91       3    0.871 0.02970       0.8147        0.931
    5     67       2    0.845 0.03403       0.7809        0.914
    6     47       2    0.809 0.04099       0.7326        0.894
    7     39       2    0.768 0.04826       0.6786        0.868
    9     30       1    0.742 0.05300       0.6451        0.854
   10     27       1    0.715 0.05773       0.6099        0.837
   12     24       3    0.625 0.06984       0.5023        0.778
   14     16       2    0.547 0.08004       0.4107        0.729
   22     12       1    0.501 0.08537       0.3592        0.700
   25      7       2    0.358 0.10512       0.2015        0.637
   28      3       1    0.239 0.12006       0.0891        0.640
   29      2       1    0.119 0.10359       0.0218        0.654

7.4.1 plot survival curve for kaplan meier by group(hypertension)

ggsurvplot(KM4, data = data2, risk.table = TRUE, 
           linetype = c(1,2), pval = TRUE)

Patients without hypertension had slightly better early survival, with survival decreasing from 98.7% at baseline to around 55% by day 12 and 24% by day 41. Among those with hypertension, survival also declined steadily from 98.6% at baseline to about 62.5% by day 12 and 11.9% by day 29. Overall, both groups showed declining survival, but the hypertensive group experienced a more pronounced drop earlier, especially after day 10, indicating a potential association between hypertension and poorer short-term survival. However, caution is needed when interpreting late survival estimates due to smaller sample sizes and wider confidence intervals.

8 Estimate the survival function

Using quantile() function to get the estimated survival duration for any percentile (with their 95% CI). For example, to get the value for survival duration at 25, 50 (median) and 75 percentile

8.1 overall

quantile(KM1, probs = c(0.25, 0.50, 0.75))
$quantile
25 50 75 
 7 22 41 

$lower
25 50 75 
 6 14 28 

$upper
25 50 75 
12 NA NA 

Based on the Kaplan-Meier estimate, 25% of patients died within 7 days (95% CI: 6 to 12), 50% died within 22 days (95% CI: 14 to NA), and 75% died within 41 days (95% CI: 28 to NA). ## sex

quantile(KM2, probs = c(0.25, 0.50, 0.75))
$quantile
            25 50 75
sex3=female  7 25 29
sex3=male    9 22 NA

$lower
            25 50 75
sex3=female  5 12 25
sex3=male    6 12 22

$upper
            25 50 75
sex3=female 12 NA NA
sex3=male   NA NA NA

Among females, 25% died within 7 days (95% CI: 5 to 12), 50% within 25 days (95% CI: 12 to NA), and 75% within 29 days (95% CI: 25 to NA). Among males, 25% died within 9 days (95% CI: 6 to NA), 50% within 22 days (95% CI: 12 to NA), while the 75th percentile was not estimable due to censoring. ## dm status

quantile(KM3, probs = c(0.25, 0.50, 0.75))
$quantile
         25 50 75
dm2cat=0  6 18 NA
dm2cat=1 14 25 28

$lower
         25 50 75
dm2cat=0  5 12 25
dm2cat=1  7 14 28

$upper
         25 50 75
dm2cat=0 12 NA NA
dm2cat=1 NA NA NA

Among patients without diabetes (DM???), 25% died within 6 days (95% CI: 5 to 12), 50% within 18 days (95% CI: 12 to NA), and the 75th percentile could not be estimated due to censoring. Among those with diabetes (DM+), 25% died within 14 days (95% CI: 7 to NA), 50% within 25 days (95% CI: 14 to NA), and 75% within 28 days (95% CI: 28 to NA). ## hpt status

quantile(KM4, probs = c(0.25, 0.50, 0.75))
$quantile
          25 50 75
hpt2cat=0  6 18 41
hpt2cat=1  9 25 28

$lower
          25 50 75
hpt2cat=0  5  9 41
hpt2cat=1  6 14 25

$upper
          25 50 75
hpt2cat=0 18 NA NA
hpt2cat=1 14 NA NA

Among patients without hypertension (HPT???), 25% died within 6 days (95% CI: 5 to 18), 50% within 18 days (95% CI: 9 to NA), and 75% within 41 days (95% CI: 41 to NA). Among those with hypertension (HPT+), 25% died within 9 days (95% CI: 6 to 14), 50% within 25 days (95% CI: 14 to NA), and 75% within 28 days (95% CI: 25 to NA).

9 Perform the log-rank test

Comparing the survival estimates between levels of a group (categorical) variable ## To test if the survival estimates differ between groups (male vs female)

logrank.sex <- survdiff(Surv(time = time, event = status3b == 'dead') ~ sex3,
                        data = data2, rho = 0)
logrank.sex
Call:
survdiff(formula = Surv(time = time, event = status3b == "dead") ~ 
    sex3, data = data2, rho = 0)

              N Observed Expected (O-E)^2/E (O-E)^2/V
sex3=female 129       38     33.3     0.666      1.89
sex3=male    97       15     19.7     1.126      1.89

 Chisq= 1.9  on 1 degrees of freedom, p= 0.2 

There is no statistically significant difference in survival between males and females (??? = 1.9, df = 1, p = 0.2). Even though females had slightly more observed deaths (38 vs 15), the difference is not significant (p > 0.05).

9.1 To test if the survival estimates differ between groups DM status (yes,no)

logrank.dm <- survdiff(Surv(time = time, event = status3b == 'dead') ~ dm2cat,
                        data = data2, rho = 0)
logrank.dm
Call:
survdiff(formula = Surv(time = time, event = status3b == "dead") ~ 
    dm2cat, data = data2, rho = 0)

           N Observed Expected (O-E)^2/E (O-E)^2/V
dm2cat=0 152       39     33.9     0.752      2.19
dm2cat=1  74       14     19.1     1.340      2.19

 Chisq= 2.2  on 1 degrees of freedom, p= 0.1 

There is no statistically significant difference in survival between patients with and without diabetes (DM2) (??? = 2.2, df = 1, p = 0.1). ## To test if the survival estimates differ between groups HPT status (yes,no)

logrank.hpt <- survdiff(Surv(time = time, event = status3b == 'dead') ~ hpt2cat,
                        data = data2, rho = 0)
logrank.hpt
Call:
survdiff(formula = Surv(time = time, event = status3b == "dead") ~ 
    hpt2cat, data = data2, rho = 0)

            N Observed Expected (O-E)^2/E (O-E)^2/V
hpt2cat=0  78       18     17.7   0.00458   0.00729
hpt2cat=1 148       35     35.3   0.00230   0.00729

 Chisq= 0  on 1 degrees of freedom, p= 0.9 

There is no significant difference in survival between patients with and without hypertension (??? = 0.0, df = 1, p = 0.9). # Cox proportional hazard (PH) regression

10 Univariable and Multivarible Cox PH regression

10.1 Model with no covariate

cox1  <- coxph(Surv(time = time, event = status3b == 'dead') ~ 1,
                 data = data2)
summary(cox1)
Call:  coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    1, data = data2)

Null model
  log likelihood= -228.599 
  n= 226 

the null model estimates the baseline hazard of death over time for the entire sample (n = 226), without considering any covariates. The log-likelihood = -228.599, which serves as a reference point for comparing with more complex models

10.2 Model with covariate(uniavariable)

10.2.1 gcs

cox.gcs <- coxph(Surv(time = time, event = status3b == 'dead') ~ gcs,
                 data = data2)
summary(cox.gcs)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    gcs, data = data2)

  n= 226, number of events= 53 

        coef exp(coef) se(coef)     z Pr(>|z|)    
gcs -0.18784   0.82875  0.03227 -5.82 5.89e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    exp(coef) exp(-coef) lower .95 upper .95
gcs    0.8288      1.207    0.7779    0.8829

Concordance= 0.783  (se = 0.035 )
Likelihood ratio test= 34.14  on 1 df,   p=5e-09
Wald test            = 33.87  on 1 df,   p=6e-09
Score (logrank) test = 39.48  on 1 df,   p=3e-10

There is a significant association between GCS score and survival (p < 0.001). For each 1-point increase in GCS, the hazard of death decreases by 17% (HR = 0.825, 95% CI: 0.774-0.880). The concordance index is 0.785, indicating good predictive ability of the model. ### 2. dm ### 2. diabetes mellitus (DM2)

cox.dm <- coxph(Surv(time = time, event = status3b == 'dead') ~ dm2cat,
                 data = data2)
summary(cox.dm)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    dm2cat, data = data2)

  n= 226, number of events= 53 

          coef exp(coef) se(coef)      z Pr(>|z|)
dm2cat -0.4618    0.6302   0.3127 -1.477     0.14

       exp(coef) exp(-coef) lower .95 upper .95
dm2cat    0.6302      1.587    0.3414     1.163

Concordance= 0.574  (se = 0.035 )
Likelihood ratio test= 2.33  on 1 df,   p=0.1
Wald test            = 2.18  on 1 df,   p=0.1
Score (logrank) test = 2.22  on 1 df,   p=0.1

There is no significant association between diabetes status (DM2) and survival (p = 0.14). Although those with DM2 had a 37% lower hazard of death (HR = 0.63), the confidence interval (0.34-1.16) crosses 1, indicating the result is not statistically significant. ### 3. hpt ### 3. hypertension (HPT2)

cox.hpt <- coxph(Surv(time = time, event = status3b == 'dead') ~ hpt2cat,
                 data = data2)
summary(cox.hpt)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    hpt2cat, data = data2)

  n= 226, number of events= 53 

            coef exp(coef) se(coef)      z Pr(>|z|)
hpt2cat -0.02418   0.97611  0.29336 -0.082    0.934

        exp(coef) exp(-coef) lower .95 upper .95
hpt2cat    0.9761      1.024    0.5493     1.735

Concordance= 0.511  (se = 0.041 )
Likelihood ratio test= 0.01  on 1 df,   p=0.9
Wald test            = 0.01  on 1 df,   p=0.9
Score (logrank) test = 0.01  on 1 df,   p=0.9
cox.icd10 <- coxph(Surv(time = time, event = status3b == 'dead') ~ icd10cat3,
                 data = data2)
summary(cox.icd10)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    icd10cat3, data = data2)

  n= 226, number of events= 53 

                                  coef exp(coef) se(coef)     z Pr(>|z|)  
icd10cat3SAH                    0.8160    2.2614   0.4093 1.993   0.0462 *
icd10cat3ICB, Other Haemorrhage 0.7833    2.1886   0.3256 2.406   0.0161 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                                exp(coef) exp(-coef) lower .95 upper .95
icd10cat3SAH                        2.261     0.4422     1.014     5.044
icd10cat3ICB, Other Haemorrhage     2.189     0.4569     1.156     4.143

Concordance= 0.65  (se = 0.043 )
Likelihood ratio test= 6.96  on 2 df,   p=0.03
Wald test            = 6.58  on 2 df,   p=0.04
Score (logrank) test = 6.83  on 2 df,   p=0.03

There is no significant association between hypertension status and survival (p = 0.934). The hazard ratio is 0.98 (95% CI: 0.55-1.74), indicating no meaningful difference in risk of death between those with and without hypertension.

10.2.2 Main effect models(multivariable)

choose variables that are clinically relevant and statistically significant in univariable analysis

cox.main <- coxph(
  Surv(time = time, event = status3b == 'dead') ~ gcs + age2  + dm2cat + icd10cat3 + hpt2cat +dyslipid2cat,
  data = data2
)

summary(cox.main)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    gcs + age2 + dm2cat + icd10cat3 + hpt2cat + dyslipid2cat, 
    data = data2)

  n= 226, number of events= 53 

                                    coef exp(coef) se(coef)      z Pr(>|z|)    
gcs                             -0.18757   0.82897  0.03537 -5.303 1.14e-07 ***
age2                             0.03158   1.03208  0.01125  2.807    0.005 ** 
dm2cat                          -0.30646   0.73605  0.34425 -0.890    0.373    
icd10cat3SAH                     0.28057   1.32389  0.43707  0.642    0.521    
icd10cat3ICB, Other Haemorrhage  0.39057   1.47782  0.34024  1.148    0.251    
hpt2cat                         -0.12839   0.87951  0.32687 -0.393    0.694    
dyslipid2cat                    -0.48336   0.61671  1.04338 -0.463    0.643    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                                exp(coef) exp(-coef) lower .95 upper .95
gcs                                0.8290     1.2063   0.77344    0.8885
age2                               1.0321     0.9689   1.00957    1.0551
dm2cat                             0.7360     1.3586   0.37487    1.4452
icd10cat3SAH                       1.3239     0.7554   0.56210    3.1181
icd10cat3ICB, Other Haemorrhage    1.4778     0.6767   0.75859    2.8790
hpt2cat                            0.8795     1.1370   0.46346    1.6691
dyslipid2cat                       0.6167     1.6215   0.07979    4.7666

Concordance= 0.818  (se = 0.028 )
Likelihood ratio test= 45.48  on 7 df,   p=1e-07
Wald test            = 42.05  on 7 df,   p=5e-07
Score (logrank) test = 50.17  on 7 df,   p=1e-08

In this model adjusting for multiple variables, lower GCS and older age were significantly associated with increased risk of death. Each unit increase in GCS reduced the hazard of death by 17% (HR = 0.83, p < 0.001). Each year increase in age increased the hazard by 3% (HR = 1.03, p = 0.008). Other variables-including diabetes, hypertension, dyslipidemia, and diagnosis type-were not significantly associated with survival.

10.2.3 Model with interaction (gcs:age)

cox.interaction <- coxph(
  Surv(time = time, event = status3b == 'dead') ~ gcs + age2 + dm2cat + icd10cat3 + hpt2cat + dyslipid2cat + 
    gcs:age2,
  data = data2
)

summary(cox.interaction)
Call:
coxph(formula = Surv(time = time, event = status3b == "dead") ~ 
    gcs + age2 + dm2cat + icd10cat3 + hpt2cat + dyslipid2cat + 
        gcs:age2, data = data2)

  n= 226, number of events= 53 

                                     coef exp(coef)  se(coef)      z Pr(>|z|)  
gcs                             -0.338425  0.712892  0.179009 -1.891   0.0587 .
age2                             0.011057  1.011119  0.026241  0.421   0.6735  
dm2cat                          -0.276960  0.758085  0.346927 -0.798   0.4247  
icd10cat3SAH                     0.292616  1.339928  0.442535  0.661   0.5085  
icd10cat3ICB, Other Haemorrhage  0.408583  1.504684  0.346992  1.177   0.2390  
hpt2cat                         -0.108935  0.896788  0.327946 -0.332   0.7398  
dyslipid2cat                    -0.505265  0.603346  1.042383 -0.485   0.6279  
gcs:age2                         0.002443  1.002446  0.002830  0.863   0.3881  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                                exp(coef) exp(-coef) lower .95 upper .95
gcs                                0.7129     1.4027   0.50194     1.013
age2                               1.0111     0.9890   0.96043     1.064
dm2cat                             0.7581     1.3191   0.38407     1.496
icd10cat3SAH                       1.3399     0.7463   0.56285     3.190
icd10cat3ICB, Other Haemorrhage    1.5047     0.6646   0.76223     2.970
hpt2cat                            0.8968     1.1151   0.47157     1.705
dyslipid2cat                       0.6033     1.6574   0.07821     4.654
gcs:age2                           1.0024     0.9976   0.99690     1.008

Concordance= 0.816  (se = 0.028 )
Likelihood ratio test= 46.23  on 8 df,   p=2e-07
Wald test            = 39.58  on 8 df,   p=4e-06
Score (logrank) test = 51.44  on 8 df,   p=2e-08

In the multivariable Cox model that includes the interaction between GCS and age, only GCS remained statistically significant. A one-point increase in GCS was associated with a 30% lower risk of death (HR = 0.70, 95% CI: 0.49-1.00, p = 0.048). The interaction term GCS and age was not significant (p = 0.332), indicating that the effect of GCS on survival did not vary significantly by age. All other variables - including age, diabetes, hypertension, dyslipidemia, and diagnosis category - were also not significantly associated with mortality. The model showed good discrimination with a concordance of 0.819, suggesting it predicts outcomes reliably.

10.3 Models comparison

anova(cox.main, cox.interaction, test = 'Chisq')

The interaction between GCS and age was not statistically significant, so the simpler model without the interaction is preferred

11 Model checking

11.1 Linearity in hazard assumption

Check only for numerical covariates

data2$status_event <- ifelse(data2$status3b == "dead", 1, 0)


data2_clean <- na.omit(data2[, c("time", "status_event", "gcs", "age2")])


data2_clean$gcs <- as.numeric(data2_clean$gcs)
data2_clean$age2 <- as.numeric(data2_clean$age2)

fit <- coxph(Surv(time = time, event = status_event) ~ gcs + age2, data = data2_clean)


ggcoxfunctional(fit = fit, data = data2_clean)

11.2 Proportional hazard assumption

11.2.1 The km method

prop.h <- cox.zph(cox.main, transform = 'km', global = TRUE)
prop.h
                chisq df    p
gcs          0.000399  1 0.98
age2         0.932115  1 0.33
dm2cat       2.692377  1 0.10
icd10cat3    4.217580  2 0.12
hpt2cat      0.011412  1 0.91
dyslipid2cat 0.487945  1 0.48
GLOBAL       6.195032  7 0.52

The test for the proportional hazards assumption shows that all variables in the model, including GCS, age, diabetes, diagnosis, hypertension, and dyslipidemia, have p-values greater than 0.05. This indicates that there is no significant evidence of violation for any individual variable. The global test also shows a p-value of 0.591, suggesting that the overall model satisfies the proportional hazards assumption. Therefore, the Cox regression model is appropriate for this data.

ggcoxzph(prop.h)

to look each clearly

plot(prop.h)

11.2.2 The rank method

prop.h.r <- cox.zph(cox.main, transform = 'rank')
prop.h.r
             chisq df    p
gcs          2.039  1 0.15
age2         0.464  1 0.50
dm2cat       0.897  1 0.34
icd10cat3    3.316  2 0.19
hpt2cat      0.412  1 0.52
dyslipid2cat 0.628  1 0.43
GLOBAL       7.549  7 0.37
ggcoxzph(prop.h.r)

to look clearly

plot(prop.h.r)

The global Schoenfeld test gave a p-value of 0.5915, suggesting that the proportional hazards (PH) assumption is not violated. In the residual plots, all covariates show smooth curves that are approximately flat, and most confidence bands include zero. There is no strong evidence of time-dependent effects. Thus, we can conclude that the Cox model is appropriate, and the PH assumption holds for all included variables.`

#Model diagnostics Prediction use expand.grid() function to create a dataframe

cox.main.h <- tidy(cox.main, conf.int = TRUE)
cox.main.hr <- tidy(cox.main, exponentiate = TRUE, conf.int = TRUE)
bind_cols(cox.main.h, cox.main.hr)

create a new data frame for the purpose of prediction

new_data <- expand.grid(
  gcs = c(5, 10, 12),
  age2 = c(40, 50, 60),
  dm2cat = c("no", "yes"),
  icd10cat3 = c("CI,Others", "SAH", "ICB, Other Haemorrhage"),
  hpt2cat = c("no", "yes"),
  dyslipid2cat = c("no", "yes")
)

new_data
new_data

The linear predictor We will use main effect models cox.mv

predict(cox.main, newdata = new_data, type = 'lp')
           1            2            3            4            5            6 
 0.737148928 -0.200722487 -0.575871052  1.052939061  0.115067646 -0.260080919 
           7            8            9           10           11           12 
 1.368729194  0.430857780  0.055709214  0.430691343 -0.507180072 -0.882328638 
          13           14           15           16           17           18 
 0.746481476 -0.191389939 -0.566538505  1.062271609  0.124400194 -0.250748372 
          19           20           21           22           23           24 
 1.017721695  0.079850281 -0.295298285  1.333511828  0.395640414  0.020491848 
          25           26           27           28           29           30 
 1.649301961  0.711430547  0.336281981  0.711264110 -0.226607305 -0.601755871 
          31           32           33           34           35           36 
 1.027054243  0.089182828 -0.285965738  1.342844376  0.404972961  0.029824395 
          37           38           39           40           41           42 
 1.127719059  0.189847644 -0.185300922  1.443509192  0.505637777  0.130489211 
          43           44           45           46           47           48 
 1.759299325  0.821427910  0.446279344  0.821261473 -0.116609941 -0.491758507 
          49           50           51           52           53           54 
 1.137051606  0.199180192 -0.175968374  1.452841739  0.514970325  0.139821759 
          55           56           57           58           59           60 
 0.608756801 -0.329114614 -0.704263180  0.924546934 -0.013324481 -0.388473047 
          61           62           63           64           65           66 
 1.240337067  0.302465652 -0.072682914  0.302299215 -0.635572199 -1.010720765 
          67           68           69           70           71           72 
 0.618089348 -0.319782066 -0.694930632  0.933879481 -0.003991933 -0.379140499 
          73           74           75           76           77           78 
 0.889329568 -0.048541847 -0.423690413  1.205119701  0.267248286 -0.107900280 
          79           80           81           82           83           84 
 1.520909834  0.583038419  0.207889853  0.582871982 -0.354999432 -0.730147998 
          85           86           87           88           89           90 
 0.898662115 -0.039209299 -0.414357865  1.214452249  0.276580834 -0.098567732 
          91           92           93           94           95           96 
 0.999326931  0.061455516 -0.313693050  1.315117064  0.377245649  0.002097083 
          97           98           99          100          101          102 
 1.630907197  0.693035782  0.317887217  0.692869346 -0.245002069 -0.620150635 
         103          104          105          106          107          108 
 1.008659479  0.070788064 -0.304360502  1.324449612  0.386578197  0.011429631 
         109          110          111          112          113          114 
 0.253791943 -0.684079471 -1.059228037  0.569582076 -0.368289338 -0.743437904 
         115          116          117          118          119          120 
 0.885372209 -0.052499205 -0.427647771 -0.052665642 -0.990537057 -1.365685623 
         121          122          123          124          125          126 
 0.263124491 -0.674746924 -1.049895490  0.578914624 -0.358956791 -0.734105357 
         127          128          129          130          131          132 
 0.534364710 -0.403506704 -0.778655270  0.850154844 -0.087716571 -0.462865137 
         133          134          135          136          137          138 
 1.165944977  0.228073562 -0.147075004  0.227907125 -0.709964290 -1.085112856 
         139          140          141          142          143          144 
 0.543697258 -0.394174157 -0.769322723  0.859487391 -0.078384024 -0.453532590 
         145          146          147          148          149          150 
 0.644362074 -0.293509341 -0.668657907  0.960152207  0.022280792 -0.352867774 
         151          152          153          154          155          156 
 1.275942340  0.338070925 -0.037077641  0.337904489 -0.599966926 -0.975115492 
         157          158          159          160          161          162 
 0.653694622 -0.284176793 -0.659325359  0.969484755  0.031613340 -0.343535226 
         163          164          165          166          167          168 
 0.125399816 -0.812471599 -1.187620165  0.441189949 -0.496681466 -0.871830032 
         169          170          171          172          173          174 
 0.756980082 -0.180891333 -0.556039899 -0.181057770 -1.118929184 -1.494077750 
         175          176          177          178          179          180 
 0.134732363 -0.803139051 -1.178287617  0.450522497 -0.487348918 -0.862497484 
         181          182          183          184          185          186 
 0.405972583 -0.531898832 -0.907047398  0.721762716 -0.216108699 -0.591257265 
         187          188          189          190          191          192 
 1.037552849  0.099681434 -0.275467132  0.099514998 -0.838356417 -1.213504983 
         193          194          195          196          197          198 
 0.415305131 -0.522566284 -0.897714850  0.731095264 -0.206776151 -0.581924717 
         199          200          201          202          203          204 
 0.515969946 -0.421901469 -0.797050034  0.831760079 -0.106111335 -0.481259901 
         205          206          207          208          209          210 
 1.147550212  0.209678798 -0.165469768  0.209512361 -0.728359054 -1.103507620 
         211          212          213          214          215          216 
 0.525302494 -0.412568921 -0.787717487  0.841092627 -0.096778788 -0.471927354 
augment(cox.main, newdata = new_data)

This calculates the relative risk (Hazard Ratio) of created populations against the population sample (population) average, which are • mean gcs = 12.02 • mean age = 60.75 • proportion of diabetes = 38.5 percent • proportion of hypertension = 65.3 percent The first observation shows that this population has • gcs = 5 • age = 40 • no diabetes • no hypertension • no dyslipidemia

predict(cox.main, newdata = new_data, type = 'risk')
        1         2         3         4         5         6         7         8 
2.0899684 0.8181394 0.5622149 2.8660623 1.1219493 0.7709892 3.9303528 1.5385767 
        9        10        11        12        13        14        15        16 
1.0572902 1.5383207 0.6021913 0.4138182 2.1095644 0.8258105 0.5674864 2.8929351 
       17        18        19        20        21        22        23        24 
1.1324690 0.7782182 2.7668838 1.0831249 0.7443095 3.7943451 1.4853351 1.0207032 
       25        26        27        28        29        30        31        32 
5.2033464 2.0369031 1.3997337 2.0365641 0.7972338 0.5478488 2.7928267 1.0932805 
       33        34        35        36        37        38        39        40 
0.7512884 3.8299218 1.4992620 1.0302736 3.0886035 1.2090654 0.8308542 4.2355331 
       41        42        43        44        45        46        47        48 
1.6580426 1.1393856 5.8083662 2.2737442 1.5624879 2.2733658 0.8899323 0.6115500 
       49        50        51        52        53        54        55        56 
3.1175630 1.2204019 0.8386445 4.2752464 1.6735888 1.1500688 1.8381448 0.7195605 
       57        58        59        60        61        62        63        64 
0.4944728 2.5207259 0.9867639 0.6780915 3.4567784 1.3531912 0.9298956 1.3529660 
       65        66        67        68        69        70        71        72 
0.5296323 0.3639566 1.8553797 0.7263073 0.4991091 2.5443609 0.9960160 0.6844494 
       73        74        75        76        77        78        79        80 
2.4334976 0.9526175 0.6546265 3.3371585 1.3063648 0.8977171 4.5763871 1.7914734 
       81        82        83        84        85        86        87        88 
1.2310776 1.7911753 0.7011738 0.4818377 2.4563146 0.9615494 0.6607644 3.3684485 
       89        90        91        92        93        94        95        96 
1.3186135 0.9061343 2.7164529 1.0633832 0.7307433 3.7251870 1.4582625 1.0020993 
       97        98        99       100       101       102       103       104 
5.1085070 1.9997772 1.3742213 1.9994444 0.7827029 0.5378634 2.7419229 1.0733537 
      105       106       107       108       109       110       111       112 
0.7375949 3.7601153 1.4719355 1.0114952 1.2889036 0.5045545 0.3467234 1.7675282 
      113       114       115       116       117       118       119       120 
0.6919170 0.4754765 2.4238864 0.9488551 0.6520410 0.9486972 0.3713772 0.2552056 
      121       122       123       124       125       126       127       128 
1.3009887 0.5092853 0.3499743 1.7841010 0.6984045 0.4799346 1.7063639 0.6679735 
      129       130       131       132       133       134       135       136 
0.4590229 2.3400092 0.9160205 0.6294775 3.2089538 1.2561777 0.8632292 1.2559687 
      137       138       139       140       141       142       143       144 
0.4916618 0.3378637 1.7223631 0.6742366 0.4633268 2.3619496 0.9246093 0.6353796 
      145       146       147       148       149       150       151       152 
1.9047715 0.7456423 0.5123958 2.6120940 1.0225309 0.7026701 3.5820754 1.4022400 
      153       154       155       156       157       158       159       160 
0.9636013 1.4020066 0.5488298 0.3771488 1.9226311 0.7526336 0.5172001 2.6365856 
      161       162       163       164       165       166       167       168 
1.0321183 0.7092585 1.1336016 0.4437599 0.3049461 1.5545560 0.6085468 0.4181856 
      169       170       171       172       173       174       175       176 
2.1318285 0.8345260 0.5734756 0.8343872 0.3266294 0.2244555 1.1442305 0.4479207 
      177       178       179       180       181       182       183       184 
0.3078054 1.5691318 0.6142527 0.4221066 1.5007614 0.5874884 0.4037145 2.0580578 
      185       186       187       188       189       190       191       192 
0.8056477 0.5536308 2.8223020 1.1048189 0.7592174 1.1046350 0.4324207 0.2971539 
      193       194       195       196       197       198       199       200 
1.5148329 0.5929968 0.4074998 2.0773546 0.8132017 0.5588218 1.6752626 0.6557987 
      201       202       203       204       205       206       207       208 
0.4506564 2.2973587 0.8993245 0.6180043 3.1504655 1.2332819 0.8474955 1.2330766 
      209       210       211       212       213       214       215       216 
0.4827004 0.3317055 1.6909703 0.6619476 0.4548819 2.3188993 0.9077568 0.6237988 

This population has 2.92 times higher risk for death (the event) compared to the average population

For this, we need to add variable event and time

new_data2 <- expand.grid(status3b = 'dead', time = c(5, 20, 50))
new_data2

And combine with the previous data frame

new_data3 <- data.frame(new_data, new_data2)
head(new_data3)

And the predicted number of events are

pred.exp <- predict(cox.main, newdata = new_data3, type = 'expected')
pred.exp
  [1] 0.2818429 0.3026275 0.6673289 0.3865032 0.4150060 0.9151365 0.5300282
  [8] 0.5691153 1.2549655 0.2074504 0.2227489 0.4911873 0.2844855 0.3054650
 [15] 0.6735860 0.3901271 0.4188972 0.9237170 0.3731284 0.4006449 0.8834687
 [22] 0.5116868 0.5494214 1.2115381 0.7016979 0.7534448 1.6614336 0.2746411
 [29] 0.2948946 0.6502769 0.3766270 0.4044014 0.8917523 0.5164845 0.5545729
 [36] 1.2228978 0.4165140 0.4472299 0.9861941 0.5711833 0.6133054 1.3524098
 [43] 0.7832879 0.8410517 1.8546170 0.3065750 0.3291835 0.7258879 0.4204193
 [50] 0.4514233 0.9954409 0.5765388 0.6190559 1.3650904 0.2478832 0.2661635
 [57] 0.5869214 0.3399328 0.3650012 0.8048703 0.4661642 0.5005417 1.1037527
 [64] 0.1824544 0.1959095 0.4320034 0.2502074 0.2686591 0.5924245 0.3431201
 [71] 0.3684236 0.8124169 0.3281696 0.3523706 0.7770182 0.4500329 0.4832207
 [78] 1.0655580 0.6171492 0.6626611 1.4612449 0.2415491 0.2593623 0.5719240
 [85] 0.3312466 0.3556745 0.7843037 0.4542525 0.4877515 1.0755489 0.3663276
 [92] 0.3933425 0.8673661 0.5023605 0.5394073 1.1894559 0.6889083 0.7397121
 [99] 1.6311513 0.2696353 0.2895196 0.6384246 0.3697623 0.3970306 0.8754987
[106] 0.5070708 0.5444649 1.2006085 0.1738152 0.1866333 0.4115482 0.2383602
[113] 0.2559382 0.5643735 0.3268735 0.3509789 0.7739493 0.1279367 0.1373714
[120] 0.3029199 0.1754450 0.1883832 0.4154070 0.2405951 0.2583379 0.5696652
[127] 0.2301119 0.2470816 0.5448436 0.3155622 0.3388334 0.7471672 0.4327438
[134] 0.4646567 1.0246221 0.1693738 0.1818643 0.4010320 0.2322695 0.2493983
[141] 0.5499522 0.3185210 0.3420104 0.7541728 0.2568682 0.2758110 0.6081954
[148] 0.3522542 0.3782313 0.8340442 0.4830612 0.5186847 1.1437602 0.1890678
[155] 0.2030106 0.4476621 0.2592766 0.2783971 0.6138980 0.3555571 0.3817777
[162] 0.8418644 0.1528720 0.1641456 0.3619601 0.2096398 0.2250998 0.4963712
[169] 0.2874880 0.3086889 0.6806949 0.1125214 0.1208193 0.2664206 0.1543053
[176] 0.1656846 0.3653539 0.2116055 0.2272104 0.5010253 0.2023853 0.2173103
[183] 0.4791946 0.2775396 0.2980069 0.6571399 0.3806019 0.4086695 0.9011638
[190] 0.1489657 0.1599512 0.3527110 0.2042830 0.2193479 0.4836876 0.2801419
[197] 0.3008011 0.6633014 0.2259177 0.2425781 0.5349130 0.3098106 0.3326577
[204] 0.7335489 0.4248564 0.4561876 1.0059467 0.1662867 0.1785496 0.3937226
[211] 0.2280360 0.2448526 0.5399285 0.3127154 0.3357767 0.7404268
cbind(new_data3, pred.exp)

11.3 Residuals

use residuals to assess for model fitness residuals() can be calculated to produce martingale, deviance, score or Schoenfeld residuals for a Cox proportional hazards model. This give the score residuals for each predictor in the cox model

score.cox <- resid(cox.main, type = "score")
head(score.cox)
         gcs        age2      dm2cat icd10cat3SAH
1  5.9581437  6.34873344  0.73241824  -0.16202960
2 -0.2755716  0.04279784  0.01098818   0.00598501
3 -0.9473612 -0.24575977 -0.09772489   0.04020035
4 -2.5588815 -4.92734584  0.23315970   0.24454774
5 -0.7954379 -0.34040771  0.03135584   0.02062267
6 -0.5351756  7.08126442 -0.08154442  -0.08238960
  icd10cat3ICB, Other Haemorrhage     hpt2cat dyslipid2cat
1                     -0.43328668  0.36303845 -0.025078424
2                      0.02077705 -0.01800972  0.001510523
3                     -0.07660164 -0.04344618 -0.133362917
4                     -0.45574370 -0.25181268  0.010530776
5                     -0.07023712 -0.04707945  0.003539093
6                      0.20147616 -0.23493089 -0.006362353

11.3.1 Martingale residuals

marti.cox <- resid(cox.main, type = "martingale")
head(marti.cox)
          1           2           3           4           5           6 
 0.95359636 -0.04575586 -0.13535240 -0.83049007 -0.12759222  0.36881246 

11.3.2 Schoenfeld residuals

schoen.cox <- resid(cox.main, type = "schoenfeld")
head(schoen.cox)
        gcs       age2     dm2cat icd10cat3SAH icd10cat3ICB, Other Haemorrhage
0 -4.530969  -8.166289  0.7812731   -0.1205885                       0.5155407
0  2.469031 -11.166289 -0.2187269   -0.1205885                       0.5155407
0 -5.530969 -18.166289 -0.2187269   -0.1205885                       0.5155407
1  2.131857   2.208277 -0.2268867    0.8770547                      -0.4590436
1 -5.868143  -3.791723 -0.2268867   -0.1229453                      -0.4590436
1 -5.868143  16.208277 -0.2268867   -0.1229453                       0.5409564
     hpt2cat dyslipid2cat
0  0.4042782  -0.03773028
0 -0.5957218  -0.03773028
0  0.4042782  -0.03773028
1  0.4099536  -0.03564588
1 -0.5900464  -0.03564588
1  0.4099536  -0.03564588

11.3.3 Scaled Schoenfeld residuals

sschoen.cox <- resid(cox.main, type = "scaledsch")
head(sschoen.cox)
          gcs          age2     dm2cat icd10cat3SAH
0 -0.33810943 -0.0004075409  3.8789872    1.1837455
0 -0.00483645 -0.0204467512 -0.2136612    0.6274852
0 -0.45409143 -0.0955914580 -2.7718119   -0.1219364
1  0.10816445  0.0300911317 -1.9612192    8.0403905
1 -0.70033225  0.0234825921 -1.7265692   -5.2232366
1 -0.54168178  0.1356748588 -1.8018881    0.7746042
  icd10cat3ICB, Other Haemorrhage   hpt2cat dyslipid2cat
0                        3.399763  1.056932   -3.0325636
0                        2.439070 -2.206839   -2.0517529
0                        1.933477  3.221641   -0.4238138
1                        1.655905  3.426026   -3.4913985
1                       -4.478282 -3.722358    0.2176774
1                        2.781873  1.752185   -0.2008579

11.3.4 dfbeta

dfbeta.cox <- resid(cox.main, type = "dfbeta")
head(dfbeta.cox)
           [,1]          [,2]         [,3]         [,4]         [,5]
1  0.0070524795  4.423334e-04  0.075300997 -0.018109862 -0.026102351
2 -0.0002882357  4.813394e-05  0.002043072  0.001442933  0.002243795
3 -0.0005586845 -2.507572e-05 -0.006758294 -0.004828505 -0.012160733
4 -0.0031960441 -2.579148e-04  0.023767466 -0.005283831 -0.039582067
5 -0.0011096788  2.396637e-05  0.002766303 -0.005814632 -0.008471155
6 -0.0011483170  1.107047e-03  0.004844222 -0.004896750  0.013441408
           [,6]         [,7]
1  7.496927e-03 -0.106728454
2 -2.305172e-03  0.003421862
3  6.665546e-05 -0.135287884
4 -3.452977e-02  0.016429133
5 -7.332889e-03  0.007461845
6 -2.801988e-02  0.006508073

11.3.5 Residuals plots

Plot to identify the outliers using score residuals.

plot(data2$gcs, score.cox[,2], ylab="Score residuals")

plot(data2$dm2cat, score.cox[,1], ylab="Score residuals")

Plot to identify the outliers using martingale residuals.

plot(data2$hpt2cat, marti.cox, ylab = "Martingale residuals for sex")

plot(marti.cox, type = 'h', main = "Martingale residuals", ylab = "dfbetas",lwd = 2)

Using dfbetas, we can assess for the presence of influential subjects.

plot(data2$gcs, dfbeta.cox[,1], main = "Dfbetas for gcs", ylab = "dfbetas")

plot(data2$gcs, dfbeta.cox[,2], type = 'h',
main = "Dfbetas for bmi", ylab = "dfbetas",lwd = 2)

But you use the augment() function to do similar tasks as above. The resulting datasets consists of • the fitted variable • the std error of the fitted variable • the residuals

pred.cox.main <- augment(cox.main, data = data2)
pred.cox.main

11.3.6 Using deviance

ggcoxdiagnostics(cox.main, type = 'deviance', linear.predictions = FALSE)

11.3.7 Using martingale

ggcoxdiagnostics(cox.main, type = 'martingale', linear.predictions = FALSE)

11.3.8 Using dfbeta

ggcoxdiagnostics(cox.main, type = 'dfbeta', linear.predictions = FALSE)

https://github.com/thuzaifah/Advance-statisitical-assignment.git